# Replication materials for Clause Analysis (Van Atteveldt et al., Political Analysis)
# Auxiliary functions

library(irr)
library(reshape2)

#' Compute subject/object/source triples from clauses containing aggression
#'
#' @param tokens data frame with columns `id`, `sentence`, `actor` and
#'   `attack` (used as a logical index to select attack tokens).
#' @param quotes data frame with columns `id`, `quote_id` and `quote_role`
#'   (rows with role "quote" and "source" are used).
#' @param clauses data frame with columns `id`, `clause_id` and `clause_role`
#'   (rows with role "subject" and "predicate" are used).
#' @return data frame with columns `clause_id`, `sentence`, `subject`, `object`
#'   (factors, "" when missing) and `source` (character, "" when the clause is
#'   not part of a quote). Stops with an error if a clause belongs to more than
#'   one quote or if the source of a clause resolves to more than one actor.
get.aggressions <- function(tokens, quotes, clauses) {
  # copy the token-level sentence and actor onto the clause rows
  token_idx <- match(clauses$id, tokens$id)
  clauses$sentence <- tokens$sentence[token_idx]
  clauses$actor <- tokens$actor[token_idx]
  subjects <- with(subset(clauses, clause_role == "subject" & !is.na(actor)),
                   unique(data.frame(clause_id = clause_id, subject = actor)))
  objects <- with(subset(clauses, clause_role == "predicate" & !is.na(actor)),
                  unique(data.frame(clause_id = clause_id, object = actor)))

  # remove objects that are also subject in the same clause
  su <- paste(subjects$clause_id, subjects$subject)
  ob <- paste(objects$clause_id, objects$object)
  objects <- objects[!(ob %in% su), ]

  # clauses that contain at least one attack token, with their subject/object actors
  attack <- tokens$id[tokens$attack]
  agcl <- data.frame(clause_id = unique(clauses$clause_id[clauses$id %in% attack]))
  agcl$sentence <- clauses$sentence[match(agcl$clause_id, clauses$clause_id)]
  agcl <- merge(agcl, subjects, all.x = TRUE)
  agcl <- merge(agcl, objects, all.x = TRUE)
  # keep only clauses for which at least one actor was identified
  agcl <- agcl[!is.na(agcl$subject) | !is.na(agcl$object), ]

  na.to.blank <- function(x) {
    x <- as.character(x)
    x[is.na(x)] <- ""
    as.factor(x)
  }
  agcl$subject <- na.to.blank(agcl$subject)
  agcl$object <- na.to.blank(agcl$object)

  # get sources: link each clause to the quote its tokens are part of
  # (merge uses the columns shared by both frames -- presumably just `id`)
  clausequotes <- merge(clauses, quotes[quotes$quote_role == "quote", ])
  clausequotes <- unique(clausequotes[c("clause_id", "quote_id")])

  # sanity check: max 1 quote per clause
  dupes <- unique(clausequotes$clause_id[duplicated(clausequotes$clause_id)])
  if (length(dupes) > 0) {
    stop("Multiple sources for clause(s) ", paste(dupes, collapse = ", "))
  }

  sources <- merge(quotes[quotes$quote_role == "source", c("id", "quote_id")],
                   tokens[c("id", "actor")])
  sources <- merge(clausequotes, sources, all.x = TRUE)

  # quoted clauses whose source contains no known actor are attributed to "other"
  src_actors <- unique(na.omit(sources)[c("clause_id", "actor")])
  others <- unique(sources$clause_id[!(sources$clause_id %in% src_actors$clause_id)])
  if (length(others) > 0) {
    src_actors <- rbind(src_actors, data.frame(clause_id = others, actor = "other"))
  }

  # sanity check: one actor per source
  dupes <- unique(src_actors$clause_id[duplicated(src_actors$clause_id)])
  if (length(dupes) > 0) {
    stop("Multiple actors for sources of clause ", paste(dupes, collapse = ", "))
  }

  # base-R rename: neither irr nor reshape2 provides rename()
  names(src_actors)[names(src_actors) == "actor"] <- "source"
  agcl <- merge(agcl, src_actors, all.x = TRUE)
  # convert first so that "" can be assigned even when the actor column is a factor
  agcl$source <- as.character(agcl$source)
  agcl$source[is.na(agcl$source)] <- ""
  agcl
}

#' Compare the subject/object of gold vs actual triples
#'
#' Rows with neither a subject nor an object are ignored. The remaining rows
#' are matched on sentence, subject and object; each matched row is a true
#' positive, rows only in `actual` are false positives and rows only in `gold`
#' are false negatives. Precision, recall and f1 are reported via message().
#'
#' @param gold data frame with columns `sentence`, `subject`, `object`.
#' @param actual data frame with the same columns.
#' @return (invisibly) the merged data frame with a `class` column holding
#'   "TP", "FP" or "FN".
compare <- function(gold, actual) {
  # keep rows for which at least one actor is filled in (NA counts as "no")
  with_actor <- function(d) {
    keep <- d[["subject"]] != "" | d[["object"]] != ""
    d[keep & !is.na(keep), , drop = FALSE]
  }
  gold <- with_actor(gold)
  actual <- with_actor(actual)

  # marker columns; after the full outer join they show up as f.x (gold) and f.y (actual)
  gold$f <- TRUE
  actual$f <- TRUE
  combined <- merge(gold, actual, by = c("sentence", "subject", "object"), all = TRUE)

  combined$class <- "TP"
  combined$class[is.na(combined$f.x)] <- "FP"
  combined$class[is.na(combined$f.y)] <- "FN"
  combined$f.y <- NULL
  combined$f.x <- NULL

  n_hits <- sum(combined$class == "TP")
  pr <- n_hits / nrow(actual)
  re <- n_hits / nrow(gold)
  f1 <- (2 * pr * re) / (pr + re)

  message("Precision:", round(pr, 2))
  message("Recall:", round(re, 2))
  message("f1: ", round(f1, digits = 2))

  invisible(combined)
}

#' Compare the source of gold vs actual triples
#'
#' Each input is reduced to its unique (sentence, source) pairs and the two
#' sets are matched on both columns. Pairs with a blank (or missing) source
#' are not scored. Precision, recall and f1 are reported via message().
#'
#' @param gold data frame with columns `sentence` and `source`.
#' @param actual data frame with the same columns.
#' @return (invisibly) data frame with columns `sentence`, `source` and
#'   `class` ("TP", "FP" or "FN"). Stops with an error if a sentence has more
#'   than one distinct source in either input.
compare.sources <- function(gold, actual) {
  gold.sources <- unique(gold[c("sentence", "source")])
  actual.sources <- unique(actual[c("sentence", "source")])

  # sanity check: max 1 source per sentence
  .check_unique <- function(x, name = "id") {
    dupes <- unique(x[duplicated(x)])
    if (length(dupes) > 0) {
      stop("Multiple sources for ", name, ": ", paste(dupes, collapse = ", "))
    }
  }
  .check_unique(gold.sources$sentence, "gold sentence")
  .check_unique(actual.sources$sentence, "actual sentence")

  gold.sources$g <- rep(TRUE, nrow(gold.sources))
  actual.sources$a <- rep(TRUE, nrow(actual.sources))

  x <- merge(actual.sources, gold.sources, all = TRUE)
  # only the presence markers are filled in; blanket NA replacement would
  # also turn a missing source into the literal value FALSE
  x$a[is.na(x$a)] <- FALSE
  x$g[is.na(x$g)] <- FALSE
  x <- x[!is.na(x$source) & x$source != "", ]

  x$class <- rep("TP", nrow(x))
  x$class[!x$g] <- "FP"
  x$class[!x$a] <- "FN"
  x$a <- NULL
  x$g <- NULL

  # count with sum(): indexing table() by name yields NA for an absent class
  tp <- sum(x$class == "TP")
  fp <- sum(x$class == "FP")
  fn <- sum(x$class == "FN")
  pr <- tp / (tp + fp)
  re <- tp / (tp + fn)
  message("precision: ", round(pr, digits = 2))
  message("recall: ", round(re, digits = 2))
  message("f1: ", round((2 * pr * re) / (pr + re), digits = 2))

  invisible(x)
}

#' Compute word-order baseline subject/object
#'
#' Tokens flagged `hamas` / `israel` are labelled as actor "Hamas" / "Israel";
#' for every sentence the smallest token id per actor is taken (presumably the
#' first mention, assuming ids follow word order). If only one of the two
#' actors occurs it becomes the subject; if both occur, the one with the
#' smaller id is the subject and the other the object.
#'
#' @param tokens data frame with columns `id`, `sentence`, `actor` and the
#'   logical flags `hamas` and `israel`.
#' @return data frame with one row per sentence: `sentence`, one column per
#'   actor value (holding the smallest token id), `subject` and `object`.
get.baseline <- function(tokens) {
  tokens$actor[tokens$hamas] <- "Hamas"
  tokens$actor[tokens$israel] <- "Israel"

  # smallest token id per (sentence, actor), spread to one column per actor
  first_id <- aggregate(tokens["id"], tokens[c("sentence", "actor")], min)
  first_id <- first_id[first_id$actor != "", ]
  actors <- dcast(first_id, sentence ~ actor, value.var = "id")

  no_hamas <- is.na(actors$Hamas)
  no_israel <- is.na(actors$Israel)
  both <- !no_hamas & !no_israel
  israel_first <- both & actors$Hamas > actors$Israel
  hamas_first <- both & actors$Hamas < actors$Israel

  # only one actor mentioned: that actor is the subject
  actors$subject[no_hamas] <- "israel"
  actors$subject[no_israel] <- "hamas"

  # both mentioned: earliest id is the subject, the other the object
  actors$subject[israel_first] <- "israel"
  actors$object[israel_first] <- "hamas"
  actors$subject[hamas_first] <- "hamas"
  actors$object[hamas_first] <- "israel"

  actors
}

#' Compute word-order baseline sources
#'
#' For every sentence that has both a baseline actor (see get.baseline) and a
#' token flagged `say`, the source is the baseline subject if the smallest
#' actor token id is smaller than the smallest `say` token id, and "other"
#' otherwise. Sentences with an actor but no `say` token get a blank source.
#'
#' @param tokens data frame as expected by get.baseline, with an additional
#'   logical `say` column.
#' @return data frame with columns `sentence` and `source`.
get.baseline.sources <- function(tokens) {
  actors <- get.baseline(tokens)

  # smallest id of a `say` token per sentence
  says <- subset(tokens, say)
  says <- aggregate(list(say = says$id), says["sentence"], min)

  sources <- merge(actors, says)
  sources$minactor <- apply(sources[c("Hamas", "Israel")], MARGIN = 1, min, na.rm = TRUE)

  sources$source <- "other"
  actor_first <- sources$minactor < sources$say
  sources$source[actor_first] <- sources$subject[actor_first]
  sources <- unique(sources[c("sentence", "source")])
  if (length(sources$sentence) != length(unique(sources$sentence))) stop("Oeps")

  # add sentences without a say verb, with a blank source; rep() keeps this
  # valid when there are none (a length-1 "" cannot be recycled to zero rows)
  unsourced <- actors$sentence[!(actors$sentence %in% sources$sentence)]
  rbind(sources, data.frame(sentence = unsourced, source = rep("", length(unsourced))))
}

#' Merge half-filled triples within a sentence into one complete triple
#'
#' If a sentence contains a row with subject "israel" and no object as well as
#' a row with object "hamas" and no subject, both rows are replaced by a single
#' israel -> hamas row (with clause_id and source set to NA). The same is then
#' done for hamas -> israel. Sentences without such a pair are left untouched.
#'
#' @param ai data frame with exactly the columns `clause_id`, `sentence`,
#'   `subject`, `object` and `source`.
#' @return `ai` with the matching half-rows replaced by merged rows (appended
#'   at the end).
merge_incomplete <- function(ai) {
  # merge one direction: `subject` without object + `object` without subject
  merge_direction <- function(ai, subject, object) {
    # %in% never yields NA, so rows with missing actors are simply not matched
    subject_only <- ai$subject %in% subject & ai$object %in% ""
    object_only <- ai$subject %in% "" & ai$object %in% object
    to_merge <- intersect(ai$sentence[subject_only], ai$sentence[object_only])
    # nothing to merge: data.frame() cannot recycle NA / strings to zero rows
    if (length(to_merge) == 0) return(ai)

    halves <- ai$sentence %in% to_merge & (subject_only | object_only)
    merged <- data.frame(clause_id = NA, sentence = to_merge, subject = subject,
                         object = object, source = NA)
    rbind(ai[!halves, ], merged)
  }

  ai <- merge_direction(ai, "israel", "hamas")
  merge_direction(ai, "hamas", "israel")
}


#' Get and normalize named entities for explorative graphs
#'
#' Consecutive tokens with the same non-empty entity type are treated as one
#' entity. The entity id is the id of its first token and its name is the
#' lemmas of its tokens joined with "_". Only ORGANIZATION, PERSON and LOCATION
#' entities are kept; a number of names are then mapped to a canonical form and
#' finally underscores are replaced by spaces with each word capitalized.
#'
#' @param tokens data frame with columns `id`, `lemma` and `entity` (entity
#'   type per token, "" for tokens that are not part of an entity).
#' @return data frame with columns `id`, `name` and `type`.
get.entities <- function(tokens) {
  entity_type <- as.character(tokens$entity)
  entity_type[is.na(entity_type)] <- ""
  prev_type <- c("", entity_type)[seq_along(entity_type)]

  # An entity starts where the type is non-empty and differs from the previous
  # token. The running count of starts indexes the id of the entity's first
  # token, which keeps ids aligned with tokens even when the text does not
  # start with an entity (na.locf with its default na.rm=TRUE would drop the
  # leading NAs and shift every id).
  is_start <- entity_type != "" & entity_type != prev_type
  if (!any(is_start)) {
    return(data.frame(id = tokens$id[0], name = character(0), type = tokens$entity[0]))
  }
  run <- cumsum(is_start)
  run[run == 0L] <- NA
  entity_id <- tokens$id[is_start][run]
  entity_id[entity_type == ""] <- NA

  entities <- aggregate(tokens$lemma, by = list(id = entity_id), FUN = paste, collapse = "_")
  colnames(entities) <- c("id", "name")
  entities$type <- tokens$entity[match(entities$id, tokens$id)]

  entities <- subset(entities, type %in% c("ORGANIZATION", "PERSON", "LOCATION"))

  # map spelling variants and full names to one canonical name
  entities$name[entities$name == "United_States"] <- "U.S."
  entities$name[entities$name %in% c("U.N.", "United_Nations")] <- "UN"
  entities$name[entities$name %in% c("UN_security_Council", "")] <- "Security_Council"
  entities$name[entities$name == "Gaza_Strip"] <- "Gaza"
  entities$name[entities$name %in% c("Israeli", "israeli")] <- "Israel"
  entities$name[tolower(entities$name) %in% c("idf")] <- "IDF"
  entities$name[entities$name %in% c("Ehud_Olmert")] <- "Olmert"
  entities$name[entities$name %in% c("Ehud_Barak")] <- "Barak"
  entities$name[entities$name %in% c("Mark_Regev")] <- "Regev"
  entities$name[entities$name %in% c("Barack_Obama")] <- "Obama"
  entities$name[entities$name %in% c("Tzipi_Livni")] <- "Livni"
  entities$name[entities$name %in% c("Gordon_Johndroe")] <- "Johndroe"
  entities$name[entities$name %in% c("Condoleezza_Rice", "Condoleeza_Rice")] <- "Rice"
  entities$name[grepl("hassanein", entities$name, ignore.case = TRUE)] <- "Hassanein"
  entities$name[grepl("assad$", entities$name, ignore.case = TRUE)] <- "Assad"
  entities$name[grepl("ahmadinej", entities$name, ignore.case = TRUE)] <- "Ahmadinejad"

  # "gaza_strip" -> "Gaza Strip": split on "_", upper-case each first letter
  capwords <- function(s) {
    cap <- function(words) {
      paste(toupper(substring(words, 1, 1)), substring(words, 2), sep = "", collapse = " ")
    }
    # vapply keeps the result a character vector even when `s` is empty
    vapply(strsplit(s, split = "_"), cap, character(1), USE.NAMES = !is.null(names(s)))
  }

  entities$name <- capwords(entities$name)
  entities
}

#' Build (and optionally plot) a clustered co-occurrence backbone network
#'
#' @param dtm document-term matrix passed on to coOccurenceNetwork().
#' @param max.vertices maximum number of vertices kept in the backbone network.
#' @param doplot if TRUE, the network is plotted.
#' @param ... further arguments passed on to plot().
#' @return the backbone network with cluster, label size and shape attributes set.
plot.cooc <- function(dtm, max.vertices=25, doplot=T, ...) {
  full_network <- coOccurenceNetwork(dtm)
  g <- semnet::getBackboneNetwork(full_network, max.vertices = max.vertices, alpha = .1)

  # cluster membership from edge-betweenness community detection (undirected)
  communities <- edge.betweenness.community(g, directed = FALSE)
  V(g)$cluster <- communities$membership

  # setNetworkAttributes is defined elsewhere; it receives twice the vertex
  # frequency and the cluster membership
  g <- setNetworkAttributes(g, V(g)$freq * 2, V(g)$cluster)
  V(g)$label.cex <- V(g)$label.cex * 1.5
  V(g)$shape <- "none"

  if (doplot) {
    plot(g, ...)
  }
  g
}
#' Plot the co-occurrence network of terms over-represented in a target set
#'
#' The rows of `dtm` named in `target.rows` are compared with those named in
#' `ref.rows` using corpora.compare(); terms with over > 1 and chi > 5 are kept
#' and passed to plot.cooc().
#'
#' @param dtm document-term matrix with row names.
#' @param target.rows row names of the target documents.
#' @param ref.rows row names of the reference documents.
#' @param filename if not NULL, the plot is written to this file as a
#'   500x500 png instead of the current device.
#' @param ... further arguments passed on to plot.cooc().
#' @return the network returned by plot.cooc().
plot.cooc.from.dtm <- function(dtm, target.rows, ref.rows, filename=NULL, ...) {
  trg <- dtm[rownames(dtm) %in% target.rows, ]
  ref <- dtm[rownames(dtm) %in% ref.rows, ]
  ph <- corpora.compare(trg, ref)
  if (!is.null(filename)) {
    png(filename, 500, 500)
    # close the device even if building or plotting the network fails
    on.exit(dev.off(), add = TRUE)
  }
  g <- plot.cooc(trg[, colnames(trg) %in% ph$term[ph$over > 1 & ph$chi > 5]], ...)
  g
}

#' Apply the default layout settings to a network
#'
#' @param g network whose edges carry an `n` attribute (copied to `weight`).
#' @param backbone if TRUE, the network is first reduced with
#'   semnet::getBackboneNetwork (max 25 vertices, alpha .1, direction 'out').
#' @return the network with cluster, label size, arrow size and edge
#'   curvature attributes set.
graph.settings <- function(g, backbone=F) {
  g <- simplify(g, remove.multiple = FALSE)
  E(g)$weight <- E(g)$n

  if (backbone) {
    g <- semnet::getBackboneNetwork(g, max.vertices = 25, alpha = .1, direction = 'out')
  }

  # cluster membership from edge-betweenness community detection (undirected)
  communities <- edge.betweenness.community(g, directed = FALSE)
  V(g)$cluster <- communities$membership

  # setNetworkAttributes is defined elsewhere; it receives twice the vertex
  # weight and the cluster membership
  g <- setNetworkAttributes(g, V(g)$weight * 2, V(g)$cluster)
  V(g)$label.cex <- V(g)$label.cex * 1.5
  E(g)$arrow.size <- .5
  E(g)$curved <- .3
  g
}
